Age of required CRAN packages


In [5]:
import pandas
from matplotlib import pyplot as plt
from matplotlib_venn import venn3, venn2

%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')

# Load the two CSV inputs with every column kept as a regular column
# (index_col=None, so nothing is promoted to the index).
# `pandas.read_csv` is used because `DataFrame.from_csv` was deprecated and
# later removed from pandas.
cran_release = pandas.read_csv('../data/cran-packages-150601.csv', index_col=None)
data = pandas.read_csv('../data/github-cran-bioc-alldata-150420.csv', index_col=None)

In [6]:
# Names that are filtered out of every dependency list further down
# (presumably R itself plus the packages bundled with it -- confirm).
R_packages = [
    'R', 'MASS', 'Matrix', 'base', 'boot', 'class', 'cluster', 'codetools',
    'compiler', 'datasets', 'foreign', 'grDevices',
    'graphics', 'grid', 'lattice', 'methods', 'mgcv', 'nlme', 'nnet',
    'parallel', 'rpart',
    'spatial', 'splines', 'stats', 'stats4', 'survival', 'tcltk', 'tools',
    'translations', 'utils',
]

In [7]:
# Keep one row per package: rows are sorted by `mtime` in ascending order and
# the first row of each package is kept, i.e. the one with the smallest
# `mtime`. The `package` column is renamed to `Package` and only
# `Package` / `mtime` are retained.
# `sort_values` and `keep='first'` replace `DataFrame.sort` and
# `take_last=False`, which no longer exist in current pandas releases.
cran_release = (
    cran_release
    .sort_values('mtime')
    .drop_duplicates('package', keep='first')
    .rename(columns={'package': 'Package'})
    [['Package', 'mtime']]
)

In [8]:
# Restrict to CRAN and GitHub rows and keep, for every (Package, Source) pair,
# the row with the greatest `Date` (rows are sorted by `Date` ascending and the
# last duplicate is kept). Missing values become empty strings so that the
# `Depends` / `Imports` fields can be split as text later on.
# `sort_values` and `keep='last'` replace `DataFrame.sort` and
# `take_last=True`, which no longer exist in current pandas releases; the
# duplicate key is given as a list so it is read as two column labels.
data = (
    data
    .query('Source == "cran" or Source == "github"')
    .sort_values('Date')
    .drop_duplicates(['Package', 'Source'], keep='last')
    [['Package', 'Version', 'Source', 'Date', 'Depends', 'Imports']]
    .fillna('')
)

In [9]:
# packages[name][source] -> list of dependency names for that package as seen
# in that source, with every name listed in R_packages left out.
packages = {}

for _, record in data.iterrows():
    per_source = packages.setdefault(record['Package'], {})
    # Depends and Imports are both split on single spaces; blank tokens are dropped.
    raw_tokens = record['Depends'].split(' ') + record['Imports'].split(' ')
    cleaned = [token.strip() for token in raw_tokens if len(token.strip()) > 0]
    per_source[record['Source']] = [dep for dep in cleaned if dep not in R_packages]

In [10]:
# For each source ('github' / 'cran'), collect the dependencies of that
# source's packages which themselves have a 'cran' entry in `packages`.
cran_required = {'github': set(), 'cran': set()}

# `.values()` / `.items()` exist on both Python 2 and Python 3 dictionaries,
# unlike `.iteritems()`, which is Python 2 only. The package name itself is
# not needed here, so only the per-source mappings are iterated.
for package in packages.values():
    for source, deps in package.items():
        for dep in deps:
            if packages.get(dep, {}).get('cran') is not None:
                cran_required[source].add(dep)

In [11]:
# Venn diagram of the two sets of required packages: those required from
# GitHub packages versus those required from CRAN packages.
required_sets = (cran_required['github'], cran_required['cran'])
venn2(required_sets, set_labels=('github', 'cran'))


Out[11]:
<matplotlib_venn._common.VennDiagram instance at 0x7fa27ce9de60>

In [12]:
# One row per CRAN package, indexed by package name, carrying its `Date`
# from `data` and the `mtime` found in `cran_release` (left join, so packages
# without a match in `cran_release` are kept).
cran_rows = data[data['Source'] == 'cran']
required = pandas.merge(
    cran_rows[['Package', 'Date']],
    cran_release,
    on='Package',
    how='left',
).set_index('Package')

In [13]:
# Add the five indicator columns, all initialised to NaN ("not flagged").
# The columns are created in this order so the resulting column layout is
# GitHub, CRAN, GitHubOnly, CRANOnly, Both.
# `float('nan')` is used instead of `pandas.np.nan`: the `pandas.np` alias
# is not available in current pandas releases.
for column in ('GitHub', 'CRAN', 'GitHubOnly', 'CRANOnly', 'Both'):
    required[column] = float('nan')

In [14]:
# Flag with 1 every package that is required from GitHub packages
# (column 'GitHub') and every package required from CRAN packages
# (column 'CRAN').
for column, source in (('GitHub', 'github'), ('CRAN', 'cran')):
    for name in cran_required[source]:
        required.loc[name, column] = 1

In [15]:
# Preview the first ten rows of the table.
required.head(10)


Out[15]:
Date mtime GitHub CRAN GitHubOnly CRANOnly Both
Package
A3 2013-09-03 00:00:00 2013-02-07 10:00:29 NaN NaN NaN NaN NaN
ABCExtremes 2013-09-03 00:00:00 2013-05-15 10:45:56 NaN NaN NaN NaN NaN
ABCp2 2013-09-03 00:00:00 2013-04-10 17:04:22 NaN NaN NaN NaN NaN
ACCLMA 2013-09-03 00:00:00 2012-10-29 13:13:35 NaN NaN NaN NaN NaN
ADGofTest 2013-09-03 00:00:00 2009-07-18 17:21:36 1 1 NaN NaN NaN
AIM 2013-09-03 00:00:00 2010-04-05 21:01:23 NaN NaN NaN NaN NaN
ALS 2013-09-03 00:00:00 2008-08-06 19:06:23 1 NaN NaN NaN NaN
AMAP.Seq 2013-09-03 00:00:00 2012-06-19 16:55:48 NaN NaN NaN NaN NaN
AMGET 2013-09-03 00:00:00 2013-08-02 14:19:55 NaN NaN NaN NaN NaN
ANN 2013-09-03 00:00:00 2011-10-20 10:46:23 NaN NaN NaN NaN NaN

In [16]:
def classify_requirement(row):
    """Fill in the exclusive indicator fields of one row.

    Reads ``row['GitHub']`` and ``row['CRAN']`` and sets exactly one of the
    following on the returned row (or none, when neither flag equals 1):

    * ``'Both'`` = 1 when both flags equal 1,
    * ``'GitHubOnly'`` = 1 when only ``'GitHub'`` equals 1,
    * ``'CRANOnly'`` = 1 when only ``'CRAN'`` equals 1.

    Parameters
    ----------
    row : mapping with a ``copy()`` method (e.g. a pandas Series or a dict)
        Must provide the keys ``'GitHub'`` and ``'CRAN'``.

    Returns
    -------
    A copy of ``row`` with the matching indicator set; the input is left
    untouched.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    row = row.copy()
    from_github = row['GitHub'] == 1
    from_cran = row['CRAN'] == 1
    if from_github and from_cran:
        row['Both'] = 1
    elif from_github:
        row['GitHubOnly'] = 1
    elif from_cran:
        row['CRANOnly'] = 1
    return row


# Original short name, kept so existing calls to __F keep working.
__F = classify_requirement

In [17]:
fields = ['GitHub', 'CRAN', 'GitHubOnly', 'CRANOnly', 'Both']

# Fill in the exclusive indicators row by row, then keep only `mtime` and the
# indicator columns.
d = required.apply(__F, axis=1)
# .copy() makes `d` an independent frame, so the column assignment below
# cannot end up writing to (or warning about) a selection of another frame.
d = d[['mtime'] + fields].copy()
d['mtime'] = pandas.to_datetime(d['mtime'])
d = d.set_index('mtime').sort_index()

# Running count of flagged packages over `mtime`, one line per indicator.
# cumsum() leaves NaN where a row is not flagged; ffill() carries the last
# count forward. ffill() replaces fillna(method='pad'), which is deprecated.
d.cumsum().ffill().plot(figsize=(15,6))


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa26fe77e10>

In [18]:
# Age of each row in whole days, measured from 2015-06-01 back to the row's
# `mtime` (the index of `d`).
# `.days` is used instead of `.astype('timedelta64[D]')`, which current pandas
# releases reject because day resolution is not a supported timedelta unit.
d['days'] = (pandas.to_datetime('2015-06-01') - d.index).days

# '<field>D' holds the age for rows flagged in <field> and NaN elsewhere,
# because the indicator columns contain either 1 or NaN.
for field in fields:
    d['{}D'.format(field)] = d['days'] * d[field]

In [19]:
# Box plot of the per-field age columns ('<field>D'), one box per field.
age_columns = [field + 'D' for field in fields]
d[age_columns].plot(kind='box')


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa26fe8ca10>

In [20]:
from scipy.stats import mannwhitneyu
from scipy.stats import norm
from math import sqrt

# Pairwise Mann-Whitney U comparison of the age columns ('<field>D').
# For every ordered pair (field1, field2):
#   Ud[field1][field2] -- the smaller of the two U statistics
#   zd[field1][field2] -- z-score of that U under the normal approximation
#                         (no continuity correction, no tie correction)
#   pd[field1][field2] -- two-sided p-value, 2 * Phi(z)
Ud = {}
zd = {}
pd = {}

for field1 in fields:
    for field2 in fields:
        d1, d2 = d[field1+'D'].dropna(), d[field2+'D'].dropna()
        n1, n2 = len(d1), len(d2)
        # Only the statistic is used; the p-value is recomputed below.
        u = mannwhitneyu(d1, d2, use_continuity=False)[0]
        # Depending on the SciPy version, the returned statistic is either the
        # smaller of the two U values or the U of the first sample. The two U
        # values sum to n1 * n2, so this always selects the smaller one, which
        # keeps z <= 0 and therefore 2 * norm.cdf(z) <= 1.
        u = min(u, n1 * n2 - u)
        z = (u - (n1 * n2) / 2.0) / sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0)
        Ud.setdefault(field1, {})[field2] = u
        zd.setdefault(field1, {})[field2] = z
        pd.setdefault(field1, {})[field2] = 2 * norm.cdf(z)

In [21]:
# U statistics as a square table: one row and one column per field, with rows
# and columns both sorted by label.
u_table = pandas.DataFrame.from_dict(Ud, orient='index')
u_table.sort_index(axis=0).sort_index(axis=1)


Out[21]:
Both CRAN CRANOnly GitHub GitHubOnly
Both 425964.5 689044.0 263079.5 517238.0 91273.5
CRAN 689044.0 1313820.5 506681.5 984911.5 177772.5
CRANOnly 263079.5 506681.5 243602.0 434142.5 86499.0
GitHub 517238.0 984911.5 434142.5 834632.0 159354.0
GitHubOnly 91273.5 177772.5 86499.0 159354.0 68080.5

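Caution: because several pairwise tests are performed, the p-values below must be judged against a global significance level $\alpha$. The matrix is symmetric, so only its lower triangle contains distinct tests: with 5 groups this is $\binom{5}{2} = 10$ comparisons. For a global $\alpha=0.05$, each p-value therefore has to be multiplied by 10 before it is compared to $\alpha$ (equivalently, the raw p-value is compared to $\alpha/10 = 0.005$).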


In [22]:
# Two-sided p-values as a square table: one row and one column per field, with
# rows and columns both sorted by label.
p_table = pandas.DataFrame.from_dict(pd, orient='index')
p_table.sort_index(axis=0).sort_index(axis=1)


Out[22]:
Both CRAN CRANOnly GitHub GitHubOnly
Both 1.000000e+00 9.171233e-04 2.490240e-10 1.010140e-07 6.865672e-39
CRAN 9.171233e-04 1.000000e+00 6.542091e-05 5.770613e-03 4.152996e-34
CRANOnly 2.490240e-10 6.542091e-05 1.000000e+00 1.704979e-01 1.035604e-18
GitHub 1.010140e-07 5.770613e-03 1.704979e-01 1.000000e+00 2.370454e-22
GitHubOnly 6.865672e-39 4.152996e-34 1.035604e-18 2.370454e-22 1.000000e+00

In [23]:
# z-scores as a square table: one row and one column per field, with rows and
# columns both sorted by label.
z_table = pandas.DataFrame.from_dict(zd, orient='index')
z_table.sort_index(axis=0).sort_index(axis=1)


Out[23]:
Both CRAN CRANOnly GitHub GitHubOnly
Both 0.000000 -3.314789 -6.327587 -5.324890 -13.044107
CRAN -3.314789 0.000000 -3.992353 -2.760539 -12.176382
CRANOnly -6.327587 -3.992353 0.000000 -1.370606 -8.831198
GitHub -5.324890 -2.760539 -1.370606 0.000000 -9.724509
GitHubOnly -13.044107 -12.176382 -8.831198 -9.724509 0.000000